{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Z12nIL0GmtKF"
   },
   "source": [
    "# Machine learning for Credit Card Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Plnw0bRc_Mks"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import math\n",
    "import numpy as np\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "default_edge_color = 'gray'\n",
    "default_node_color = '#407cc9'\n",
    "enhanced_node_color = '#f5b042'\n",
    "enhanced_edge_color = '#cc2f04'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Vye1SbKAnI_A"
   },
   "source": [
    "## Load Dataset and build graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv(\"/Users/claudiostamile/Downloads/fraudTrain.csv\")\n",
    "df = df[df[\"is_fraud\"]==0].sample(frac=0.20, random_state=42).append(df[df[\"is_fraud\"] == 1])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"is_fraud\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_graph_bipartite(df_input, graph_type=nx.Graph()):\n",
    "    df = df_input.copy()\n",
    "    mapping = {x:node_id for node_id,x in enumerate(set(df[\"cc_num\"].values.tolist() + df[\"merchant\"].values.tolist()))}\n",
    "    df[\"from\"] = df[\"cc_num\"].apply(lambda x: mapping[x])\n",
    "    df[\"to\"] = df[\"merchant\"].apply(lambda x: mapping[x])\n",
    "    df = df[['from', 'to', \"amt\", \"is_fraud\"]].groupby(['from', 'to']).agg({\"is_fraud\": \"sum\", \"amt\": \"sum\"}).reset_index()\n",
    "    df[\"is_fraud\"] = df[\"is_fraud\"].apply(lambda x: 1 if x>0 else 0)\n",
    "    \n",
    "    G = nx.from_edgelist(df[[\"from\", \"to\"]].values, create_using=graph_type)\n",
    "    \n",
    "    nx.set_node_attributes(G,{x:1 for x in df[\"from\"].unique()}, \"bipartite\")\n",
    "    nx.set_node_attributes(G,{x:2 for x in df[\"to\"].unique()}, \"bipartite\")\n",
    "    \n",
    "    nx.set_edge_attributes(G, \n",
    "                       {(int(x[\"from\"]), int(x[\"to\"])):x[\"is_fraud\"] for idx, x in df[[\"from\",\"to\",\"is_fraud\"]].iterrows()}, \n",
    "                       \"label\")\n",
    "\n",
    "    nx.set_edge_attributes(G, \n",
    "                       {(int(x[\"from\"]), int(x[\"to\"])):x[\"amt\"] for idx, x in df[[\"from\",\"to\",\"amt\"]].iterrows()}, \n",
    "                       \"weight\")\n",
    "    return G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_graph_tripartite(df_input, graph_type=nx.Graph()):\n",
    "    df = df_input.copy()\n",
    "    mapping = {x:node_id for node_id,x in enumerate(set(df.index.values.tolist() + \n",
    "                                                        df[\"cc_num\"].values.tolist() + \n",
    "                                                        df[\"merchant\"].values.tolist()))}\n",
    "    df[\"in_node\"] = df[\"cc_num\"].apply(lambda x: mapping[x])\n",
    "    df[\"out_node\"] = df[\"merchant\"].apply(lambda x: mapping[x])\n",
    "\n",
    "    G = nx.from_edgelist([(x[\"in_node\"], mapping[idx]) for idx, x in df.iterrows()] +\n",
    "                         [(x[\"out_node\"], mapping[idx]) for idx, x in df.iterrows()], \n",
    "                         create_using=graph_type)\n",
    "\n",
    "    nx.set_node_attributes(G,{x[\"in_node\"]:1 for idx,x in df.iterrows()}, \"bipartite\")\n",
    "    nx.set_node_attributes(G,{x[\"out_node\"]:2 for idx,x in df.iterrows()}, \"bipartite\")\n",
    "    nx.set_node_attributes(G,{mapping[idx]:3 for idx, x in df.iterrows()}, \"bipartite\")\n",
    "\n",
    "    nx.set_edge_attributes(G,{(x[\"in_node\"], mapping[idx]):x[\"is_fraud\"] for idx, x in df.iterrows()}, \"label\")\n",
    "    nx.set_edge_attributes(G,{(x[\"out_node\"], mapping[idx]):x[\"is_fraud\"] for idx, x in df.iterrows()}, \"label\")\n",
    "\n",
    "    nx.set_edge_attributes(G,{(x[\"in_node\"], mapping[idx]):x[\"amt\"] for idx, x in df.iterrows()}, \"weight\")\n",
    "    nx.set_edge_attributes(G,{(x[\"out_node\"], mapping[idx]):x[\"amt\"] for idx, x in df.iterrows()}, \"weight\")\n",
    "    return G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = build_graph_bipartite(df, nx.Graph())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from networkx.algorithms import bipartite\n",
    "bipartite.is_bipartite(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "top = nx.bipartite.sets(G)[0]\n",
    "pos = nx.bipartite_layout(G, top)\n",
    "nx.draw(G, pos=pos, with_labels=False, node_color=default_node_color, edge_color=default_edge_color)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.axis(\"off\")\n",
    "plt.figure(figsize=(10,10))\n",
    "\n",
    "nx.draw_networkx(G, pos=spring_pos, node_color=default_node_color, \n",
    "                 edges_color=default_edge_color, with_labels=False, node_size=15)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2z2PCthzneat"
   },
   "source": [
    "## Network Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(nx.info(G))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "degrees = pd.Series({k: v for k, v in nx.degree(G)})\n",
    "degrees.plot.hist()\n",
    "plt.yscale(\"log\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "allEdgesWeights = pd.Series({(d[0], d[1]): d[2][\"weight\"] for d in G.edges(data=True)})\n",
    "np.quantile(allEdgesWeights.values,[0.10,0.50,0.70,0.9,1.0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "quant_dist = np.quantile(allEdgesWeights.values,[0.10,0.50,0.70,0.9])\n",
    "quant_dist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "allEdgesWeightsFiltered = pd.Series({(d[0], d[1]): d[2][\"weight\"] for d in G.edges(data=True) \n",
    "                                     if d[2][\"weight\"] < quant_dist[-1]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "allEdgesWeightsFiltered.plot.hist(bins=40)\n",
    "plt.yscale(\"log\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "bC = nx.betweenness_centrality(G)\n",
    "bc_distr = pd.Series(bC)\n",
    "bc_distr.plot.hist()\n",
    "plt.yscale(\"log\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gPMC9VDyuF5F",
    "outputId": "871111c8-12b4-4820-8675-f74fccdd39e6",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "np.mean(list(bC.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "94viGU4vserg",
    "outputId": "ea65df57-df57-4e51-f396-9c808f274766"
   },
   "outputs": [],
   "source": [
    "# degree centrality\n",
    "plt.figure(figsize=(10,10))\n",
    "deg_C = nx.degree_centrality(G)\n",
    "degc_distr = pd.Series(deg_C)\n",
    "degc_distr.plot.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "vLp2CBJHtC1d",
    "outputId": "c1ad4b5d-4d77-4ac1-84ce-6210fd7dc11f"
   },
   "outputs": [],
   "source": [
    "# closeness centrality\n",
    "plt.figure(figsize=(10,10))\n",
    "clos_C = nx.closeness_centrality(G)\n",
    "closc_distr = pd.Series(clos_C)\n",
    "closc_distr.plot.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(list(clos_C.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "MQOah_yDtbaW",
    "outputId": "558fc1ea-f457-4386-b5ce-8dc61f8f0115"
   },
   "outputs": [],
   "source": [
    "# assortativity\n",
    "nx.degree_pearson_correlation_coefficient(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "c8peWeN9nh1m"
   },
   "source": [
    "### Community Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import community\n",
    "\n",
    "parts = community.best_partition(G, random_state=42, weight='weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "communities = pd.Series(parts)\n",
    "communities.value_counts().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "graphs = []\n",
    "d = {}\n",
    "for x in communities.unique():\n",
    "    tmp = nx.subgraph(G, communities[communities==x].index)\n",
    "    fraud_edges = sum(nx.get_edge_attributes(tmp, \"label\").values())\n",
    "    ratio = 0 if fraud_edges == 0 else (fraud_edges/tmp.number_of_edges())*100\n",
    "    d[x] = ratio\n",
    "    graphs += [tmp]\n",
    "\n",
    "pd.Series(d).sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gId = 10\n",
    "plt.figure(figsize=(10,10))\n",
    "spring_pos = nx.spring_layout(graphs[gId])\n",
    "plt.axis(\"off\")\n",
    "edge_colors = [\"r\" if x == 1 else \"g\" for x in nx.get_edge_attributes(graphs[gId], 'label').values()]\n",
    "nx.draw_networkx(graphs[gId], pos=spring_pos, node_color=default_node_color, \n",
    "                 edge_color=edge_colors, with_labels=False, node_size=15)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Supervised Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import resample\n",
    "\n",
    "df_majority = df[df.is_fraud==0]\n",
    "df_minority = df[df.is_fraud==1]\n",
    "\n",
    "df_maj_dowsampled = resample(df_majority,\n",
    "                             n_samples=len(df_minority),\n",
    "                             random_state=42)\n",
    "\n",
    "df_downsampled = pd.concat([df_minority, df_maj_dowsampled])\n",
    "\n",
    "print(df_downsampled.is_fraud.value_counts())\n",
    "G_down = build_graph_bipartite(df_downsampled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), \n",
    "                                                                      list(nx.get_edge_attributes(G_down, \"label\").values()), \n",
    "                                                                      test_size=0.20, \n",
    "                                                                      random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edgs = list(G_down.edges)\n",
    "train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()\n",
    "train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from node2vec import Node2Vec\n",
    "from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder\n",
    "\n",
    "node2vec_train = Node2Vec(train_graph, weight_key='weight')\n",
    "model_train = node2vec_train.fit(window=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier \n",
    "from sklearn import metrics \n",
    "\n",
    "classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]\n",
    "for cl in classes:\n",
    "    embeddings_train = cl(keyed_vectors=model_train.wv) \n",
    "\n",
    "    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]\n",
    "    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]\n",
    "    \n",
    "    rf = RandomForestClassifier(n_estimators=1000, random_state=42) \n",
    "    rf.fit(train_embeddings, train_labels); \n",
    "\n",
    "    y_pred = rf.predict(test_embeddings)\n",
    "    print(cl)\n",
    "    print('Precision:', metrics.precision_score(test_labels, y_pred)) \n",
    "    print('Recall:', metrics.recall_score(test_labels, y_pred)) \n",
    "    print('F1-Score:', metrics.f1_score(test_labels, y_pred)) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "R4Vk5GnxcWF2"
   },
   "source": [
    "## Unupervised Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nod2vec_unsup = Node2Vec(G_down, weight_key='weight')\n",
    "unsup_vals = nod2vec_unsup.fit(window=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]\n",
    "true_labels = [x for x in nx.get_edge_attributes(G_down, \"label\").values()]\n",
    "\n",
    "for cl in classes:\n",
    "    embedding_edge = cl(keyed_vectors=unsup_vals.wv) \n",
    "\n",
    "    embedding = [embedding_edge[str(x[0]), str(x[1])] for x in G_down.edges()]\n",
    "    kmeans = KMeans(2, random_state=42).fit(embedding)\n",
    "    \n",
    "    \n",
    "    nmi = metrics.adjusted_mutual_info_score(true_labels, kmeans.labels_)\n",
    "    ho = metrics.homogeneity_score(true_labels, kmeans.labels_)\n",
    "    co = metrics.completeness_score(true_labels, kmeans.labels_)\n",
    "    vmeasure = metrics.v_measure_score(true_labels, kmeans.labels_)\n",
    "    \n",
    "    print(cl)\n",
    "    print('NMI:', nmi)\n",
    "    print('Homogeneity:', ho)\n",
    "    print('Completeness:', co)\n",
    "    print('V-Measure:', vmeasure)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "socialNetwork.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
